import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
# Load the Gapminder dataset; every later step reads from `df`.
df = pd.read_csv("gapminder_clean.csv")

# Keep only the rows where both CO2 emissions and GDP per capita are present.
# `dropna` returns a new frame, so `df` itself is left untouched.
df_CO2_clean = df.dropna(
    subset=["CO2 emissions (metric tons per capita)", "gdpPercap"]
)

# Year-1962 subset with Kuwait left out.
# NOTE(review): Kuwait is presumably excluded as an outlier -- confirm.
is_1962 = df_CO2_clean["Year"] == 1962
not_kuwait = df_CO2_clean["Country Name"] != "Kuwait"
filtered_data = df_CO2_clean.loc[is_1962 & not_kuwait]
# Scatter plot of CO2 emissions (x) against GDP per capita (y) for the 1962
# subset: one colour per country, marker size taken from the `pop` column.
title_1962 = "CO2 emissions (metric tons per capita) and GDP per cap. Year 1962"
fig = px.scatter(
    filtered_data,
    x='CO2 emissions (metric tons per capita)',
    y="gdpPercap",
    size='pop',
    color='Country Name',
    hover_data=['Country Name'],
    title=title_1962,
)
fig.show()
# Pearson's r between CO2 emissions and GDP per capita for the 1962 subset.
from scipy.stats import pearsonr

co2_1962 = filtered_data["CO2 emissions (metric tons per capita)"]
gdp_1962 = filtered_data["gdpPercap"]
corr, p_value = pearsonr(co2_1962, gdp_1962)

# Report the coefficient together with its p-value.
print(
    "\n Pearson correlation of CO2 emissions (metric tons per capita) and gdpPercap, year 1962: \n",
    "Correlation value: ", corr,
    "p-value: ", p_value,
)
# Every year except 1962 (analysed above), still leaving Kuwait out.
# NOTE(review): the name says "unfiltered" although 1962 and Kuwait are
# removed -- confirm that this is the intended selection.
unfiltered_data = df_CO2_clean[(df_CO2_clean["Year"] != 1962) & (df_CO2_clean["Country Name"] != "Kuwait")]

# Per-year Pearson correlation between GDP per capita and CO2 emissions.
# Only the two columns of interest take part, so text columns such as
# "Country Name" are never fed to the correlation (the previous
# DataFrameGroupBy.corrwith call correlated every column and fails on
# non-numeric ones in recent pandas). Rows are matched by index label.
co2_gdp_corr_by_year = (
    unfiltered_data
    .groupby("Year")["gdpPercap"]
    .corr(unfiltered_data["CO2 emissions (metric tons per capita)"])
)

# Year with the strongest correlation (a one-row Series indexed by Year).
co2_gdp_corr_by_year.sort_values(ascending=False).head(1)
# Year-1972 subset, Kuwait again left out.
in_1972 = df_CO2_clean["Year"] == 1972
excluding_kuwait = df_CO2_clean["Country Name"] != "Kuwait"
new_filtered_data = df_CO2_clean.loc[in_1972 & excluding_kuwait]

# Same CO2-vs-GDP scatter as for 1962, but coloured by continent.
title_1972 = "CO2 emissions (metric tons per capita) and GDP per cap. Year 1972"
fig = px.scatter(
    new_filtered_data,
    x='CO2 emissions (metric tons per capita)',
    y="gdpPercap",
    size='pop',
    color="continent",
    hover_data=['Country Name'],
    title=title_1972,
)
fig.show()
# Summary statistics of per-capita energy use for each continent.
energy_use_by_continent = df.groupby("continent")["Energy use (kg of oil equivalent per capita)"]
energy_use_by_continent.describe()

# Horizontal box plot of the same variable, one box per continent.
fig = px.box(
    df,
    x="Energy use (kg of oil equivalent per capita)",
    y="continent",
    hover_data=['Country Name'],
)
fig.show()
I need to know whether the data satisfy the requirements for using parametric tests.
First, I separate the "Energy use" data by continent (giving five series, one per continent), and I remove missing values to avoid errors in the tests.
# One "Energy use" series per continent, with missing values removed.
energy_column = "Energy use (kg of oil equivalent per capita)"
continent_of_row = df["continent"]

americas_energy = df.loc[continent_of_row == "Americas", energy_column].dropna()
oceania_energy = df.loc[continent_of_row == "Oceania", energy_column].dropna()
africa_energy = df.loc[continent_of_row == "Africa", energy_column].dropna()
europe_energy = df.loc[continent_of_row == "Europe", energy_column].dropna()
asia_energy = df.loc[continent_of_row == "Asia", energy_column].dropna()
The Shapiro-Wilk test tests the null hypothesis that the data were drawn from a normal distribution (first requirement to use parametric tests).
import scipy.stats as stats


def _shapiro_report(label, sample, alpha=0.05):
    """Return the (label, test result, verdict) pieces printed for one sample.

    The verdict is derived from the Shapiro-Wilk p-value instead of being
    hard-coded, so the printed conclusion always matches the computed result.

    NOTE(review): alpha=0.05 is an assumed significance level; the notebook
    does not state which level was used -- confirm.
    """
    result = stats.shapiro(sample)
    p_value = result[1]  # the result unpacks as (statistic, p-value)
    if p_value < alpha:
        verdict = " Reject null hypotesis"
    else:
        verdict = "Can not reject null hypotesis"
    return label, result, verdict


# Normality check for the energy-use sample of every continent.
print("\n",
      *_shapiro_report("Americas: ", americas_energy), "\n",
      *_shapiro_report("Oceania: ", oceania_energy), "\n",
      *_shapiro_report("Africa: ", africa_energy), "\n",
      *_shapiro_report("Europe: ", europe_energy), "\n",
      *_shapiro_report("Asia: ", asia_energy))
Shapiro-Wilk tests: rejected 4/5 null hypotheses. Data is not normally distributed.
The Levene test tests the null hypothesis that all input samples are from populations with equal variances (third requirement to use parametric tests).
import scipy.stats as stats

# Levene test for equal variances across the five per-continent samples,
# centred on the median. `proportiontocut` is only consulted when
# center='trimmed', so it is not passed here.
energy_samples = (americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy)
stats.levene(*energy_samples, center='median')
Levene test: rejected null hypothesis.
The null hypotheses of the Shapiro-Wilk and Levene tests were rejected. The data don't satisfy the parametric requirements, so I need to use non-parametric tests.
The Kruskal-Wallis H-test tests the null hypothesis that the population medians of all of the groups are equal.
# Kruskal-Wallis H-test across the five per-continent energy-use samples.
stats.kruskal(
    americas_energy,
    oceania_energy,
    africa_energy,
    europe_energy,
    asia_energy,
)
Kruskal-Wallis H-test: rejected null hypothesis.
So I have to compare the means to detect the differences and similarities between continents.
Post hoc pairwise test for multiple comparisons of mean rank sums. This test is run after Kruskal-Wallis's one-way analysis of variance by ranks to do pairwise comparisons.
import scikit_posthocs as sp

# Dunn's post hoc test on the five per-continent energy-use samples.
continent_labels = ["America", "Oceania", "Africa", "Europe", "Asia"]
dunn_test = sp.posthoc_dunn(
    [americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy]
)

# Label rows and columns in the same order in which the samples were passed.
dunn_test.index = continent_labels
dunn_test.columns = continent_labels
dunn_test

# Heat map of the pairwise comparison table.
fig = px.imshow(
    dunn_test,
    title="Dunn's test between continents' energy use means",
)
fig.show()
Asia's and Americas' energy use means are similar.
Oceania's and Europe's energy use means are similar.
# European and Asian records after 1990 whose imports share is below 97 % of GDP.
imports_col = 'Imports of goods and services (% of GDP)'
in_europe_or_asia = df["continent"].isin(["Europe", "Asia"])
after_1990 = df["Year"] > 1990
below_cutoff = df[imports_col] < 97  # Deleted Outliers (Singapore is an exception)
europe_and_asia_after_1990 = df[in_europe_or_asia & after_1990 & below_cutoff]

# Summary statistics of the imports share per continent.
europe_and_asia_after_1990.groupby("continent")["Imports of goods and services (% of GDP)"].describe()

# Horizontal box plot of the same variable, one box per continent.
fig = px.box(
    europe_and_asia_after_1990,
    x="Imports of goods and services (% of GDP)",
    y="continent",
    hover_data=['Country Name', "Year"],
)
fig.show()
# Imports share (% of GDP) per continent, with missing values removed.
imports_column = "Imports of goods and services (% of GDP)"
continent_after_1990 = europe_and_asia_after_1990["continent"]
europe_imports = europe_and_asia_after_1990.loc[continent_after_1990 == "Europe", imports_column].dropna()
asia_imports = europe_and_asia_after_1990.loc[continent_after_1990 == "Asia", imports_column].dropna()

# Shapiro-Wilk normality check for each of the two samples.
print("\n", stats.shapiro(europe_imports), "\n", stats.shapiro(asia_imports))

# Levene test for equal variances, centred on the median. `proportiontocut`
# is only consulted when center='trimmed', so it is not passed here.
stats.levene(europe_imports, asia_imports, center='median')
As in the previous case, the parametric requirements are not satisfied. I need to compare two means using a non-parametric test.
The Mann-Whitney U test is used to compare differences between two independent groups when the dependent variable is either ordinal or continuous, but not normally distributed.
# Mann-Whitney U test comparing the Europe and Asia imports samples.
# The alternative is stated explicitly because the default has differed
# between SciPy releases; a two-sided test matches the question asked here
# (is there any difference between the two groups?).
stats.mannwhitneyu(x=europe_imports, y=asia_imports, alternative="two-sided")
Cannot reject the null hypothesis of identical average scores.
I'll group the records by Country Name, then I'll calculate the pop density mean per country
# Mean population density per country; show the five highest averages.
mean_density_by_country = df.groupby("Country Name")["Population density (people per sq. km of land area)"].mean()
mean_density_by_country.sort_values(ascending=False).head()

# Population density over the years, one line per country.
fig = px.line(
    df,
    x="Year",
    y="Population density (people per sq. km of land area)",
    color="Country Name",
    line_group="Country Name",
    hover_name="Country Name",
    title="Population density (people per sq. km of land area) across all years",
)
fig.show()
I'll extract the first and the last record that contains the "Life expectancy at birth" value for each country (not the minimum and maximum values), subtract the first value from the last, and then calculate the relative increment as a percentage:
relative increment (%) = (last record - first record)/first record * 100
# First and last available life-expectancy value for every country.
# Rows are ordered by year first, so "first"/"last" mean the earliest and
# latest record regardless of the row order in the CSV; the groupby
# first/last aggregations pick the first/last non-missing value per group.
life_expectancy_by_country = (
    df.sort_values("Year", kind="stable")
    .groupby('Country Name')['Life expectancy at birth, total (years)']
    .agg(['last', 'first'])
)

# Absolute change in years and relative change in percent of the first value.
life_expectancy_by_country['diff'] = life_expectancy_by_country['last'] - life_expectancy_by_country['first']
life_expectancy_by_country['percentage'] = life_expectancy_by_country['diff'] / life_expectancy_by_country['first'] * 100

# Expose the index as a regular column so it can be used as hover data in plots.
life_expectancy_by_country["Country Name"] = life_expectancy_by_country.index

# Countries with the largest relative increase.
life_expectancy_by_country.sort_values(by="percentage", ascending=False).head()
# Keep only strictly positive relative changes: the column doubles as the
# marker size below (original note: negative values caused an error).
has_positive_percentage = life_expectancy_by_country["percentage"] > 0
life_expectancy_by_country_values_greater_than_zero = life_expectancy_by_country[has_positive_percentage]

# Relative change (x) against the last recorded life expectancy (y).
percentage_axis_labels = {
    "last": "Life expectancy at birth, total (years). Last record.",
    "percentage": "Difference between first and last record, (percentage)",
}
fig = px.scatter(
    life_expectancy_by_country_values_greater_than_zero,
    x='percentage',
    y="last",
    color="percentage",
    size='percentage',
    hover_data=['Country Name'],
    title="Changes in life expectancy at birth, relative value (%)",
    labels=percentage_axis_labels,
)
fig.show()
Also I can compare between absolute values, getting different results:
absolute increment (years) = last record - first record
# Countries with the largest absolute increase (in years).
life_expectancy_by_country.sort_values(by="diff", ascending=False).head()

# Keep only strictly positive absolute changes: the column doubles as the
# marker size below (original note: negative values caused an error).
has_positive_diff = life_expectancy_by_country["diff"] > 0
life_expectancy_by_country_values_greater_than_zero = life_expectancy_by_country[has_positive_diff]

# Absolute change (x) against the last recorded life expectancy (y).
diff_axis_labels = {
    "last": "Life expectancy at birth, total (years). Last record.",
    "diff": "Difference between first and last record, (years)",
}
fig = px.scatter(
    life_expectancy_by_country_values_greater_than_zero,
    x='diff',
    y="last",
    color="diff",
    size='diff',
    hover_data=['Country Name'],
    title="Changes in life expectancy at birth, absolute value (Years)",
    labels=diff_axis_labels,
)
fig.show()